This R Markdown document reproduces the main analyses and figures for tumor evolution in high-clonality LUAD (Lung Adenocarcinoma) dataset from the Sherlock-Lung study
manuscript title: Deciphering lung adenocarcinoma evolution and the role of LINE-1 retrotransposition
Fig. 2: Features associated with lung tumor latency.
(You may need to install some packages if missing)
library(tidyverse)
library(ggplot2)
library(ggsankey)
library(scales)
library(hrbrthemes) # For theme_ipsum_rc
library(cowplot)
library(forcats)
library(ggrepel)
library(ggnewscale)
library(data.table)
library(ggasym)
library(ggpmisc)
library(broom)
library(ggsci)
library(ggpubr)
# --- Define Helper Functions (if any custom) -----------------------------------
# Please source your custom functions here if needed, or place them in ./functions/
# source('./functions/your_custom_functions.R')
# --- Load Input Datasets ------------------------------------------------------
# All data files should be placed in the appropriate subfolders.
# Example folder structure:
# ./data/ - for main input files
# ./functions/ - for custom R functions
# ./output/ - for saving plots and results
# Replace the filenames below with your actual dataset filenames.
load('./data/BBsolution_final3_short.RData',verbose = T)
## Loading objects:
## BBsolution
## BBsolution4
## hq_samples
## wgs_groups_info
## wgs_groups_info2
load('./data/sp_group_data.RData',verbose = T)
## Loading objects:
## sp_group_color
## sp_group_color_new
## sp_group_data
## sp_group_data2
load('./data/covdata0.RData',verbose = T)
## Loading objects:
## covdata0
load('./data/clinical_data.RData',verbose = T)
## Loading objects:
## clinical_data
load('./data/sherlock_data_all.RData',verbose = T)
## Loading objects:
## sherlock_data
## sherlock_data_full
## sherlock_overall
## sherlock_type_colors
## sherlock_freq
## sherlock_freq_hq
load('./data/sherlock_variable.RData',verbose = T)
## Loading objects:
## sherlock_variable
load('./data/sp_group_data.RData',verbose = T)
## Loading objects:
## sp_group_color
## sp_group_color_new
## sp_group_data
## sp_group_data2
load('./data/id2data.RData',verbose = T)
## Loading objects:
## id2data
## id2color
load('./data/tedata.RData',verbose = T)
## Loading objects:
## tedata
load('./data/RNASeq_Exp.RData',verbose = T)
## Loading objects:
## cdata3
## rdata1
## adata
load('./data/suvdata.RData',verbose = T)
## Loading objects:
## suvdata
# load function -----------------------------------------------------------
load('./data/ZTW_functions.RData',verbose = T)
## Loading objects:
## barcode2spg
## sp_group_color
## sp_group_color_new
## sp_group_lift
## sp_group_data
## sp_group_major_new
## sp_group_major
# load analysis related data set
load('./data/Chronological_timing.RData',verbose = T)
## Loading objects:
## mrate
## MRCAdata
## subclonesTimeAbs
## subclonesTimeAbsLinear
## finalPloidy
## finalPower
## effGenome
## finalBB
## finalSnv
## finalClusters
## remove
## branchDeam
## qRateDeam
## rateDeam
## timingclass_groups
## timingInfo
tmp <- colnames(MRCAdata)
MRCAdata <- MRCAdata %>% select(-SP_Group) %>% left_join(sp_group_data2) %>% select(-SP_Group_New) %>% select(one_of(tmp))
## analysis limited to luad only
hq_samples2 <- covdata0 %>% filter(Histology == 'Adenocarcinoma', Tumor_Barcode %in% hq_samples) %>% pull(Tumor_Barcode)
rm(list=c('hq_samples'))
# Latency difference among all genome alteration or features
tdata <- MRCAdata %>% filter(!is.na(Latency)) %>% select(Tumor_Barcode,Acc=acceleration,Latency) #Histology == "Adenocarcinoma"; acceleration == '1x'
#tdata <- MRCAdata %>% filter(!is.na(Latency),acceleration == '1x',SP_Group %in% c('N_A','N_U')) %>% select(Tumor_Barcode,Latency) #Histology == "Adenocarcinoma"
## wicox test ##
tdata <- sherlock_data_full %>%
filter(!(Type %in% c('Signature_Denovo'))) %>%
filter(Tumor_Barcode %in% tdata$Tumor_Barcode) %>%
mutate(Gene=paste0(Gene,"|",Type)) %>%
select(Tumor_Barcode,Gene,Alteration) %>%
left_join(tdata) %>%
#left_join(freq_mrca %>% dplyr::rename(Gene=name)) %>%
left_join(sp_group_data2)
#filter(Freq>0.03)
tdata_all <- bind_rows(
tdata,
tdata %>% mutate(SP_Group='ALL',SP_Group_New='ALL'),
tdata %>% filter(SP_Group_New %in% c('EU_N','AS_N')) %>% mutate(SP_Group='Non-smokers',SP_Group_New='Non-smokers')
)
#tdata <- tdata %>% filter(Freq > 0.03)
tresult_all <- tdata_all %>%
group_by(SP_Group_New,Acc,Gene) %>%
do(tresult = safely(wilcox.test)(Latency ~ Alteration, data=. )) %>%
mutate(tresult_null = map_lgl(tresult['result'], is.null)) %>%
filter(!tresult_null) %>%
mutate(fit = list(tidy(tresult[['result']]))) %>%
select(SP_Group_New,Acc,Gene,fit) %>%
unnest(cols = c(fit)) %>%
ungroup() %>%
arrange(p.value) %>%
mutate(TMP=Gene) %>%
separate(col = 'TMP',into = c('Gene_short','Type'),sep = '\\|') %>%
ungroup()
# add diff
tdata_all <- tdata_all %>% mutate(Alteration = case_when(
Alteration %in% c("Female","Y","Yes","Non-Smoker","WGD") ~ "Yes",
Alteration %in% c("Male","N","No","Smoker","nWGD") ~ "No",
TRUE ~ NA_character_
))
tmp <- tdata_all %>% filter(!is.na(Alteration)) %>% group_by(SP_Group_New,Acc,Gene,Alteration) %>% summarise(value=median(Latency,na.rm = T)) %>% ungroup() %>% pivot_wider(names_from = 'Alteration',values_from = value) %>% mutate(diff=Yes-No)
# add freq
tmpsize <- tdata_all %>% drop_na() %>% select(SP_Group_New,Acc,Tumor_Barcode,Gene) %>% unique() %>% group_by(SP_Group_New,Acc) %>% count(Gene) %>% dplyr::rename(size=n)
freq_mrca <- tdata_all %>% drop_na() %>% count(SP_Group_New,Acc,Gene,Alteration) %>% filter(!is.na(Alteration)) %>% left_join(tmpsize)%>% mutate(Freq=n/size) %>% group_by(SP_Group_New,Acc,Gene) %>% arrange(Freq) %>% dplyr::slice(1) %>% ungroup() %>% select(SP_Group_New,Acc,Gene,Freq) %>% mutate(Freq=if_else(Freq==1,0,Freq))
# tmpsize <- mdata %>% select(Tumor_Barcode,name) %>% unique() %>% count(name)
# freq_mrca <- mdata %>% count(name,value) %>% filter(!is.na(value)) %>% left_join(tmpsize)%>% mutate(Freq=n/size) %>% group_by(name) %>% arrange(Freq) %>% dplyr::slice(1) %>% ungroup() %>% select(name,Freq) %>% mutate(Freq=if_else(Freq==1,0,Freq))
tresult_all <- tresult_all %>% left_join(tmpsize) %>% left_join(tmp) %>% left_join(freq_mrca)
#saveRDS(tresult_all,file='latency_tresult.RDS')
tdata_all <- tdata_all %>%
mutate(TMP=Gene) %>%
separate(col = 'TMP',into = c('Gene_short','Type'),sep = '\\|')
save(tresult_all,tdata_all,file='latency_tresult.RData')
Associations between tumor latency and the presence of specific mutational signatures. The vertical blue dashed line represents a 5-year latency difference between tumors with and without each mutational signature, and the horizontal line indicates the significance threshold (FDR<0.05).
# Signatures Only
load('./data/latency_tresult.RData')
resulttmp <- tresult_all %>%
filter(Type=='Signature_Cosmic_final',!str_detect(Gene,'APOBEC')) %>%
filter(Acc=='1x') %>%
filter(SP_Group_New=='ALL') %>%
filter(Freq>0.01,Gene_short!='SBS5') %>%
mutate(Sig=if_else(str_detect(Gene,'^SBS'),'SBS',if_else(str_detect(Gene,'^DBS'),'DBS',if_else(str_detect(Gene,'^ID'),'ID',NA_character_)))) %>%
mutate(Sig=factor(Sig,levels=c('SBS','ID','DBS'))) %>%
group_by(Sig) %>%
mutate(FDR=p.adjust(p.value)) %>%
ungroup()
#filter(p.value<0.1)
#filter(str_detect(Gene_short,'^SBS'))
datatmp <- tdata_all %>%
filter(Type=='Signature_Cosmic_final') %>%
filter(Acc=='1x') %>%
filter(SP_Group_New=='ALL') %>%
filter(Gene_short %in% resulttmp$Gene_short) %>%
mutate(Sig=if_else(str_detect(Gene,'^SBS'),'SBS',if_else(str_detect(Gene,'^DBS'),'DBS',if_else(str_detect(Gene,'^ID'),'ID',NA_character_)))) %>%
mutate(Sig=factor(Sig,levels=c('SBS','ID','DBS')))
#tmp <- datatmp %>% count(Gene,Gene_short,Type,Alteration) %>% filter(n>5) %>% count(Gene,Gene_short,Type) %>% filter(n==2) %>% pull(Gene)
#datatmp <- datatmp %>% filter(Gene %in% tmp)
#resulttmp <- resulttmp %>% filter(Gene %in% tmp)
tmplevel <- datatmp %>%
filter(Alteration == "Yes") %>%
group_by(Gene_short) %>%
summarise(value=median(Latency)) %>%
arrange(value) %>%
pull(Gene_short)
resulttmp <- resulttmp %>% mutate(Gene_short = factor(Gene_short,levels = tmplevel))
datatmp <- datatmp %>% mutate(Gene_short = factor(Gene_short,levels = tmplevel))
# barplot
p1 <- datatmp %>%
#filter(Gene_short=='ID-Novel-C') %>%
ggplot(aes(Gene_short,Latency,fill=Alteration))+
geom_point(pch=21,size=1,position=position_jitterdodge(jitter.width = 0.1,dodge.width = 0.5),color="black")+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
scale_fill_manual(values = c("Yes" = "#4daf4a","No" = "#cccccc"))+
#scale_fill_manual(values = sigcol)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'top',panel.spacing.x = unit(0.1,'cm'))+
labs(x = "", y = 'Latency, years before diagnosis')+
scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
#guides(fill="none")+
facet_grid(.~Sig,scales = 'free',space='free')+
panel_border(color = 'black',linetype = 1)
p1
p2 <- resulttmp %>%
ggplot(aes(Gene_short,"1",fill=-log10(p.value)))+
geom_tile()+
facet_grid(.~Sig,scales = 'free',space='free')+
scale_fill_viridis_c(option = "D")+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = FALSE,grid_col = "gray90",ticks = F)+
theme(axis.text.x = element_blank(),axis.text.y = element_blank(),legend.position = 'top',legend.key.width = unit(1,"cm"),legend.key.height = unit(0.3,"cm"),panel.spacing.x = unit(0.1,'cm'))+
labs(x = "", y = '',fill="Wilcox test, -log10(P)\n")
#guides(fill="none")+
# panel_border(color = 'black',linetype = 1)
p2
#plot_grid(p2,p1,axis = 'v',align = 'lr',ncol = 1,rel_heights = c(1.1,4))
##ggsave(filename = './output/latency_signatures.pdf',width = 16,height = 10,device = cairo_pdf)
## vacanol plot
resulttmp %>%
mutate(Sig=as.factor(Sig)) %>%
ggplot(aes((diff),-log10(FDR),fill=Sig))+
geom_hline(yintercept = -log10(0.05),linetype=2,col='red',size=0.5)+
geom_vline(xintercept = c(-5,5),linetype=2,col='blue',size=0.5)+
geom_point(aes(size=Freq),pch=21,stroke=0.2)+
scale_size_binned(labels = percent_format())+
scale_fill_manual(values = ncicolpal[c(1,3,4)])+
scale_x_continuous(breaks = pretty_breaks())+
ggrepel::geom_text_repel(data=resulttmp,aes(label=Gene_short),max.overlaps = 30,size=3)+
labs(x='Latency Change (Years)',y='-log10(FDR)',size='Frequency',fill='Signature')+
guides(fill = guide_legend(override.aes = list(size=3.5)))+
theme_ipsum_rc(base_size = 12,axis_title_just = 'm',axis_title_size = 14,grid='XY',ticks = T)+
panel_border(color = 'black',size = 0.5)+
coord_cartesian(clip = 'off')
#ggsave(filename = './output/latency_fdr_signatures.pdf',width = 7,height = 5.5,device = cairo_pdf)
Associations between tumor latency and driver gene mutation status. The vertical blue dashed line indicates a 5-year latency difference between driver gene wild type and mutant groups, and the horizontal line represents the significance threshold (FDR<0.05).
# Driver Mutations
#load('latency_tresult.RData')
drglist <- readRDS('./data/drivers_intogene.RDS') %>% pull(symbol)
resulttmp <- tresult_all %>%
filter(Type=='Mutation_Driver',Gene_short %in% drglist) %>%
filter(Acc=='1x') %>%
filter(SP_Group_New=='ALL') %>%
filter(Freq>0.01) %>%
mutate(FDR=p.adjust(p.value))
#filter(p.value<0.1)
#filter(str_detect(Gene_short,'^SBS'))
datatmp <- tdata_all %>%
filter(Type=='Mutation_Driver',Gene_short %in% drglist) %>%
filter(Acc=='1x') %>%
filter(SP_Group_New=='ALL') %>%
filter(Gene_short %in% resulttmp$Gene_short)
#tmp <- datatmp %>% count(Gene,Gene_short,Type,Alteration) %>% filter(n>5) %>% count(Gene,Gene_short,Type) %>% filter(n==2) %>% pull(Gene)
#datatmp <- datatmp %>% filter(Gene %in% tmp)
#resulttmp <- resulttmp %>% filter(Gene %in% tmp)
tmplevel <- datatmp %>%
filter(Alteration == "Yes") %>%
group_by(Gene_short) %>%
summarise(value=median(Latency)) %>%
arrange(value) %>%
pull(Gene_short)
resulttmp <- resulttmp %>% mutate(Gene_short = factor(Gene_short,levels = tmplevel))
datatmp <- datatmp %>% mutate(Gene_short = factor(Gene_short,levels = tmplevel))
## vacanol plot
resulttmp %>%
ggplot(aes((diff),-log10(FDR)))+
geom_hline(yintercept = -log10(0.05),linetype=2,col='red',size=0.5)+
geom_vline(xintercept = c(-5,5),linetype=2,col='blue',size=0.5)+
geom_point(aes(size=Freq),pch=21,stroke=0.1,fill='#3D4551',col='white')+
scale_size_binned(labels = percent_format())+
scale_x_continuous(breaks = pretty_breaks())+
ggrepel::geom_text_repel(data=resulttmp %>% filter(FDR<0.2),aes(label=Gene_short),max.overlaps = 30,size=4)+
labs(x='Latency Change (Years)',y='-log10(FDR)',size='Mutation frequency',fill='Signature')+
guides(fill = guide_legend(override.aes = list(size=3.5)))+
theme_ipsum_rc(base_size = 12,axis_title_just = 'm',axis_title_size = 14,grid='XY',ticks = T)+
theme(legend.position = 'top',legend.key.width = unit(1,'cm'))+
panel_border(color = 'black',size = 0.5)+
coord_cartesian(clip = 'off')
resulttmp %>%
ggplot(aes((diff),-log10(p.value)))+
geom_hline(yintercept = -log10(0.001),linetype=2,col='red',size=0.5)+
geom_hline(yintercept = -log10(0.05),linetype=2,col='orange',size=0.5)+
geom_vline(xintercept = c(-5,5),linetype=2,col='blue',size=0.5)+
geom_point(aes(size=Freq),pch=21,stroke=0.1,fill='#3D4551',col='white')+
scale_size_binned(labels = percent_format())+
scale_x_continuous(breaks = pretty_breaks())+
scale_y_continuous(breaks = pretty_breaks())+
ggrepel::geom_text_repel(data=resulttmp %>% filter(p.value<0.05),aes(label=Gene_short),max.overlaps = 30,size=3.5)+
labs(x='Latency Change (Years)',y='-log10(FDR)',size='Mutation frequency',fill='Signature')+
guides(fill = guide_legend(override.aes = list(size=3.5)))+
theme_ipsum_rc(base_size = 12,axis_title_just = 'm',axis_title_size = 14,grid='XY',ticks = T)+
panel_border(color = 'black',size = 0.5)+
coord_cartesian(clip = 'off')
#ggsave(filename = './output/latency_driverGene_signatures.pdf',width = 5,height = 5.5,device = cairo_pdf)
## for EGFR
tmp <- resulttmp <- tresult_all %>%
filter(Type=='Mutation_Driver',Gene_short %in% 'EGFR') %>%
filter(Acc=='1x') %>%
filter(SP_Group_New %in% c('AS_N','EU_N','EU_S',"Others")) %>%
mutate(label=paste0(SP_Group_New,'\nP=',scientific_format()(p.value))) %>% select(SP_Group_New,label)
tdata_all %>%
filter(Type=='Mutation_Driver',Gene_short %in% 'EGFR') %>%
filter(Acc=='1x') %>%
filter(SP_Group_New %in% c('AS_N','EU_N','EU_S',"Others")) %>%
mutate(Aternation=factor(Alteration,levels=c('Yes','No'))) %>%
left_join(tmp) %>%
ggplot(aes(Alteration,Latency,fill=Alteration))+
geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
scale_fill_manual(values = c("Yes" = "#4daf4a","No" = "#cccccc"))+
#scale_fill_manual(values = sigcol)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'bottom',panel.spacing.x = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 2))+
labs(x = "", y = 'Latency, years before diagnosis',fill='EGFR Mutation')+
scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
#guides(fill="none")+
facet_grid(.~label,scales = 'free',space='free')+
panel_border(color = 'black',linetype = 1)
#ggsave(filename = './output/latency_driverGene_signatures2.pdf',width = 6,height = 6,device = cairo_pdf)
## for KRAS
tmp <- resulttmp <- tresult_all %>%
filter(Type=='Mutation_Driver',Gene_short %in% 'KRAS') %>%
filter(Acc=='1x') %>%
filter(SP_Group_New %in% c('AS_N','EU_N','EU_S',"Others")) %>%
mutate(label=paste0(SP_Group_New,'\nP=',scientific_format()(p.value))) %>% select(SP_Group_New,label)
tdata_all %>%
filter(Type=='Mutation_Driver',Gene_short %in% 'KRAS') %>%
filter(Acc=='1x') %>%
filter(SP_Group_New %in% c('AS_N','EU_N','EU_S','Others')) %>%
mutate(Aternation=factor(Alteration,levels=c('Yes','No'))) %>%
left_join(tmp) %>%
ggplot(aes(Alteration,Latency,fill=Alteration))+
geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
scale_fill_manual(values = c("Yes" = "#4daf4a","No" = "#cccccc"))+
#scale_fill_manual(values = sigcol)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'bottom',panel.spacing.x = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 2))+
labs(x = "", y = 'Latency, years before diagnosis',fill='KRAS Mutation')+
scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
#guides(fill="none")+
facet_grid(.~label,scales = 'free',space='free')+
panel_border(color = 'black',linetype = 1)
#ggsave(filename = './output/latency_driverGene_signatures_kras.pdf',width = 6,height = 6,device = cairo_pdf)
# # example of regression
# tdata_all %>%
# filter(Type=='Mutation_Driver',Gene_short %in% 'EGFR') %>%
# filter(Acc=='1x') %>%
# left_join(covdata0) %>%
# filter(SP_Group_New %in% c('AS_N','EU_N','EU_S')) %>%
# group_by(SP_Group_New) %>%
# do(tidy(lm(Latency~Alteration+Gender+Histology+Tumor_Purity,data=.))) %>%
# filter(term=='AlterationYes') %>%
# arrange(p.value)
# #do(tidy(t.test(Latency~Alteration,data=.)))
Figure 2b: Forest plot illustrating associations between tumor latency and EGFR mutation status, adjusted for sex and tumor purity (percentage of cancer cells within a tumor sample). P-values and regression coefficients with 95% confidence intervals (CIs) are provided for each variable. Significant associations are in red.
Figure 2c: Box plots displaying estimated tumor latency separated by EGFR mutation status and sex.
tmp <- sherlock_data_full %>%
filter(Type=='Mutation_Driver',Gene=='EGFR') %>%
select(Tumor_Barcode,EGFR=Alteration)
tdata <- MRCAdata %>%
filter(Tumor_Barcode %in% hq_samples2) %>%
filter(acceleration=='1x', Tumor_Barcode %in% hq_samples2) %>%
left_join(sp_group_data2) %>%
left_join(covdata0 %>% select(Tumor_Barcode,Gender,Tumor_Purity)) %>%
filter(SP_Group_New %in% c('AS_N','EU_N','EU_S','Others')) %>%
left_join(tmp)
tdata %>% do(tidy(wilcox.test(Latency~Gender,data=.)))
## # A tibble: 1 × 4
## statistic p.value method alternative
## <dbl> <dbl> <chr> <chr>
## 1 25393 0.00167 Wilcoxon rank sum test with continuity correcti… two.sided
tmp <- tdata %>%
group_by(SP_Group_New) %>% do(tidy(wilcox.test(Latency~Gender,data=.))) %>%
mutate(label=paste0(SP_Group_New,'\nP=',scientific_format()(p.value))) %>% select(SP_Group_New,label)
tdata %>%
left_join(tmp) %>%
ggplot(aes(Gender,Latency,fill=Gender))+
geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
scale_fill_manual(values = c("Female" = "#4daf4a","Male" = "#cccccc"))+
#scale_fill_manual(values = sigcol)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
#theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'top',panel.spacing.x = unit(0.1,'cm'))+
theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'bottom',panel.spacing.x = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 2))+
labs(x = "", y = 'Latency, years before diagnosis',fill='Sex')+
scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
facet_grid(.~label,scales = 'free',space='free')+
panel_border(color = 'black',linetype = 1)
#ggsave(filename = './output/latency_gender_signatures.pdf',width = 6,height = 5.8,device = cairo_pdf)
p1 <- tdata %>%
filter(SP_Group_New == 'EU_N') %>%
ggplot(aes(Gender,Latency,fill=Gender))+
geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
scale_fill_manual(values = c("Female" = "#984ea3","Male" = "#cccccc"))+
#scale_fill_manual(values = sigcol)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'top',panel.spacing.x = unit(0.1,'cm'))+
labs(x = "", y = 'Latency, years before diagnosis',fill='Sex')+
scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
facet_grid(.~EGFR,scales = 'free',space='free')+
panel_border(color = 'black',linetype = 1)
p1
p2 <- tdata %>%
filter(SP_Group_New == 'EU_N') %>%
ggplot(aes(EGFR,Latency,fill=EGFR))+
geom_point(pch=21,size=2,position=position_jitterdodge(jitter.width = 0.2,dodge.width = 0.5),color="black")+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
scale_fill_manual(values = c("Yes" = "#4daf4a","No" = "#cccccc"))+
#scale_fill_manual(values = sigcol)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid = "Y",grid_col = "gray90",ticks = T)+
theme(axis.text.x = element_text(angle = 30,hjust = 1,vjust = 1),legend.position = 'top',panel.spacing.x = unit(0.1,'cm'))+
labs(x = "", y = 'Latency, years before diagnosis',fill='EGFR Mutation')+
scale_y_continuous(breaks = pretty_breaks())+ #,limits = c(-1.5,32)
facet_grid(.~Gender,scales = 'free',space='free')+
panel_border(color = 'black',linetype = 1)
p2
plot_grid(p1,p2,align = 'v',axis = 'lr')
#ggsave(filename = './output/latency_gender_EGFR_signatures2.pdf',width = 7.5,height = 5.5,device = cairo_pdf)
tdata %>%
filter(SP_Group_New=='EU_N') %>%
group_by(Gender) %>%
do(tidy(wilcox.test(Latency~EGFR,data=.)))
## # A tibble: 2 × 5
## # Groups: Gender [2]
## Gender statistic p.value method alternative
## <fct> <dbl> <dbl> <chr> <chr>
## 1 Male 115 0.694 Wilcoxon rank sum exact test two.sided
## 2 Female 1649 0.000144 Wilcoxon rank sum test with continuity … two.sided
tdata %>%
filter(SP_Group_New=='EU_N') %>%
group_by(EGFR) %>%
do(tidy(wilcox.test(Latency~Gender,data=.)))
## # A tibble: 2 × 5
## # Groups: EGFR [2]
## EGFR statistic p.value method alternative
## <chr> <dbl> <dbl> <chr> <chr>
## 1 No 606 0.332 Wilcoxon rank sum test with continuity co… two.sided
## 2 Yes 225 0.00275 Wilcoxon rank sum test with continuity co… two.sided
tdata %>%
group_by(SP_Group_New) %>%
do(tidy(wilcox.test(Latency~Gender,data=.)))
## # A tibble: 4 × 5
## # Groups: SP_Group_New [4]
## SP_Group_New statistic p.value method alternative
## <chr> <dbl> <dbl> <chr> <chr>
## 1 AS_N 1899 0.911 Wilcoxon rank sum test with contin… two.sided
## 2 EU_N 1646 0.0103 Wilcoxon rank sum test with contin… two.sided
## 3 EU_S 1012 0.758 Wilcoxon rank sum test with contin… two.sided
## 4 Others 271 0.0510 Wilcoxon rank sum exact test two.sided
tmpresult <- tdata %>%
group_by(SP_Group_New) %>%
do(tidy(lm(Latency~Gender+EGFR+Tumor_Purity,data=.),conf.int = TRUE, conf.level = 0.95)) %>%
filter(term!='(Intercept)') %>%
ungroup() %>%
arrange(p.value) %>%
mutate(term=str_remove(term,"EGFR")) %>%
mutate(term=str_remove(term,"Histology")) %>%
mutate(term=str_remove(term,"Gender"))
tmpresult <- tmpresult %>% mutate(label=paste0('β = ',round(estimate,2), ', p = ',scientific_format(digits = 3)(p.value)))
# %>%
# bind_rows(
# tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='AS_N'),
# tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='EU_N'),
# tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='EU_S')
# )
# tmplevels <- c('Tumor_Purity','Male','Female','No','Yes')
# tmplables <- c('Tumor purity', 'Male','Female','EGFR wildtype','EGFR mutant')
tmplevels <- c('Tumor_Purity','Female','Yes')
tmplables <- c('Tumor purity', 'Female\nRef=Male','EGFR Mutant\nRef=Wildtype')
tmpresult %>%
mutate(term=factor(term,levels=tmplevels,labels=tmplables)) %>%
ggplot(aes(estimate, term, xmin = conf.low, xmax = conf.high, height = 0,color=p.value<0.05)) +
geom_point(pch=19,size=3) +
geom_vline(xintercept = 0, lty = 4) +
geom_errorbarh(height = .3)+
scale_color_manual(values = c("FALSE"='black',"TRUE"='red'))+
ggrepel::geom_text_repel(aes(label=label),size=3.5,nudge_y = 0.2)+
facet_grid(~SP_Group_New)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
theme(panel.spacing = unit(0.2,"cm"))+
labs(x = "Regression coefficient for tumor latency", y = NULL)+
guides(color="none")+
scale_x_continuous(breaks = pretty_breaks())+
panel_border(color = 'black',linetype = 1)
#ggsave(filename = './output/latency_gender_EGFR_signatures.pdf',width = 14,height = 3,device = cairo_pdf)
A multivariate regression analysis examining the relationship between tumor latency and various factors, including sex, ancestry, smoking, EGFR mutations, KRAS mutations, and mutational signature ID2. Statistically significant associations (P<0.05) are denoted in red.
tmp1 <- sherlock_data_full %>%
filter(Type=='Mutation_Driver',Gene=='EGFR') %>%
select(Tumor_Barcode,EGFR=Alteration)
tmp2 <- sherlock_data_full %>%
filter(Type=='Mutation_Driver',Gene=='KRAS') %>%
select(Tumor_Barcode,KRAS=Alteration)
tmp3 <- sherlock_data_full %>%
filter(Type=='Mutation_Driver',Gene=='TP53') %>%
select(Tumor_Barcode,TP53=Alteration)
tdata <- MRCAdata %>%
filter(Tumor_Barcode %in% hq_samples2) %>%
filter(acceleration=='1x', Tumor_Barcode %in% hq_samples2) %>%
left_join(sp_group_data2) %>%
left_join(covdata0 %>% select(Tumor_Barcode,Gender,Smoking,Assigned_Population,Tumor_Purity)) %>%
left_join(id2data %>% select(Tumor_Barcode, ID2_Present)) %>%
#filter(SP_Group_New %in% c('AS_N','EU_N','EU_S')) %>%
left_join(tmp1) %>%
left_join(tmp2) %>%
left_join(tmp3) %>%
mutate(Smoking = if_else(Smoking == 'Unknown',NA_character_,Smoking)) %>%
mutate(Smoking = factor(Smoking, levels=c('Smoker','Non-Smoker')))
tmpresult <- tdata %>%
do(tidy(lm(Latency~Gender+Assigned_Population+Smoking+ID2_Present+EGFR+KRAS+Tumor_Purity,data=.),conf.int = TRUE, conf.level = 0.95)) %>%
filter(term!='(Intercept)') %>%
ungroup() %>%
arrange(desc(estimate)) %>%
#mutate(term=str_remove(term,"EGFR")) %>%
mutate(term=str_remove(term,"Histology")) %>%
mutate(term=str_remove(term,"Gender")) %>%
mutate(label=paste0('β = ',round(estimate,2), ', p = ',scientific_format(digits = 3)(p.value)))
# tmpresult <- tmpresult %>% mutate(label=paste0('β = ',round(estimate,2), ', p = ',scientific_format(digits = 3)(p.value))) %>%
# bind_rows(
# tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='AS_N'),
# tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='EU_N'),
# tibble(term=c("No","Male"),label = "",conf.low = 0, conf.high=0,estimate=0,p.value = 1) %>% mutate(SP_Group_New='EU_S')
# )
# tmplevels <- c('Tumor_Purity','Male','Female','No','Yes')
#
# tmplables <- c('Tumor purity', 'Male','Female','EGFR wildtype','EGFR mutant')
tmplevels <- tmpresult$term
tmplables <- tmpresult$term
#tmplables <- c('EGFR Mutant\nRef=Wildtype','Female\nRef=Male','Smokers\nRef=Nonsmokers','Tumor Purity','Ancestry EAS\nRef=EUR','Ancestry Others\nRef=EUR','KRAS Mutant\nRef=Wildtype','ID2 Signature Present\nRef=Absent')
tmplables <- c('EGFR Mutant\nRef=Wildtype','Female\nRef=Male','Tumor Purity','Ancestry EAS\nRef=EUR','Nonsmokers\nRef=Smokers','Ancestry Others\nRef=EUR','KRAS Mutant\nRef=Wildtype','ID2 Signature Present\nRef=Absent')
tmpresult %>%
mutate(p.value = if_else(p.value>0.05,0.5,p.value)) %>%
mutate(term=factor(term,levels=tmplevels,labels=tmplables)) %>%
ggplot(aes(estimate, term, xmin = conf.low, xmax = conf.high, height = 0,color=-log10(p.value))) +
geom_vline(xintercept = 0, lty = 5) +
geom_errorbarh(height = .3)+
geom_point(pch=19,size=4) +
#scale_color_manual(values = c("FALSE"='black',"TRUE"='red'))+
scale_color_material(palette = 'red')+
ggrepel::geom_text_repel(aes(label=label),size=3.5,nudge_y = 0.4)+
#facet_grid(~SP_Group_New)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14,grid_col = 'gray85')+
theme(panel.spacing = unit(0.2,"cm"))+
labs(x = "Regression coefficient", y = NULL)+
guides(color="none")+
scale_x_continuous(breaks = pretty_breaks())+
panel_border(color = 'black',linetype = 1)
##ggsave(filename = './output/latency_multivariables.pdf',width = 8,height = 5,device = cairo_pdf)
summary(lm(Latency~Gender+Assigned_Population+Smoking+Tumor_Purity+Age,data=tdata))$r.squared
## [1] 0.06051831
summary(lm(Latency~Gender+Assigned_Population+Smoking+ID2_Present+Tumor_Purity+Age,data=tdata))$r.squared
## [1] 0.08865073
summary(lm(Latency~Gender+Assigned_Population+Smoking+EGFR+KRAS+Tumor_Purity+Age,data=tdata))$r.squared
## [1] 0.08998911
summary(lm(Latency~Gender+Assigned_Population+Smoking+ID2_Present+EGFR+KRAS+Tumor_Purity+Age,data=tdata))$r.squared
## [1] 0.1173433
tdata <- tdata %>% filter(Smoking == 'Non-Smoker',Assigned_Population %in% c('EAS','EUR'))
#summary(lm(Latency~SP_Group_New+ID2_Present + KRAS +EGFR+SP_Group_New:EGFR + Tumor_Purity + Gender,data=tdata)) %>% tidy() %>% arrange(p.value)
summary(lm(Latency~Assigned_Population+EGFR+EGFR:Assigned_Population + Tumor_Purity + Gender,data=tdata)) %>% tidy() %>% arrange(p.value)
## # A tibble: 6 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 EGFRYes 7.73 1.89 4.08 0.0000553
## 2 (Intercept) 10.1 3.24 3.10 0.00205
## 3 GenderFemale 3.70 1.79 2.06 0.0399
## 4 Assigned_PopulationEAS:EGFRYes -4.97 2.82 -1.76 0.0790
## 5 Tumor_Purity 5.11 5.03 1.02 0.310
## 6 Assigned_PopulationEAS 0.850 2.16 0.394 0.694
# Supplementary Fig. 12 ---------------------------------------------------
#Supplementary Fig. 12a, overall
mrate %>%
left_join(sp_group_data) %>%
#filter(Tumor_Barcode %in% hq_samples2) %>%
#filter(SP_Group!="Others") %>%
drop_na() %>%
ggplot(aes(age,log2(CpG.TpG.Gb)))+
geom_point(pch=21,size=2,fill=ncicolpal[14]) +
geom_smooth(method = 'lm')+
#facet_wrap(~SP_Group_New,nrow = 1,scales = 'free_x')+
scale_fill_manual(values = sp_group_color_new)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
labs(x = "Age at diagnosis (years)", y = "SNVs/Gb (log2)")+
guides(fill="none")+
scale_x_continuous(breaks = pretty_breaks())+
panel_border(color = 'black',linetype = 1)
#ggsave(filename = './output/mutation_rate_age_all.pdf',width = 4,height = 3.6,device = cairo_pdf)
# Supplementary Fig. 12b, seperated by the group
mrate %>%
left_join(sp_group_data) %>%
#filter(SP_Group!="Others") %>%
#filter(Tumor_Barcode %in% hq_samples2) %>%
drop_na() %>%
ggplot(aes(age,log2(CpG.TpG.Gb),fill=SP_Group_New))+
geom_point(pch=21,size=2) +
geom_smooth(method = 'lm')+
facet_wrap(~SP_Group_New,nrow = 1,scales = 'free_x')+
scale_fill_manual(values = sp_group_color_new)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
labs(x = "Age at diagnosis (years)", y = "SNVs/Gb (log2)")+
guides(fill="none")+
scale_x_continuous(breaks = pretty_breaks())+
panel_border(color = 'black',linetype = 1)
#ggsave(filename = './output/mutation_rate_age.pdf',width = 12,height = 3.5,device = cairo_pdf)
mrate %>% do(tidy(lm(log2(CpG.TpG.Gb)~age,data=.))) %>% filter(term!="(Intercept)")
## # A tibble: 1 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 age 0.0167 0.00389 4.29 0.0000209
mrate %>% group_by(SP_Group) %>% do(tidy(lm(log2(CpG.TpG.Gb)~age,data=.))) %>% filter(term!="(Intercept)") %>% ungroup() %>% arrange(p.value)
## # A tibble: 4 × 6
## SP_Group term estimate std.error statistic p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 N_A age 0.0246 0.00508 4.85 0.00000267
## 2 N_U age 0.0102 0.00485 2.10 0.0372
## 3 Others age 0.0160 0.0124 1.29 0.205
## 4 S_U age -0.00410 0.0104 -0.395 0.694
#mrate %>% left_join(clinical_data) %>% filter(Histology == 'Adenocarcinoma') %>% group_by(SP_Group) %>% do(tidy(lm(log2(CpG.TpG.Gb)~age,data=.))) %>% filter(term!="(Intercept)") %>% ungroup() %>% arrange(p.value)
#sorder <- read_rds('../SmokingVar/sorder.RDS')
#mrate %>% left_join(sorder) %>% filter(!is.na(APOBEC_Signature)) %>% group_by(APOBEC_Signature) %>% do(tidy(lm(CpG.TpG.Gb~age,data=.))) %>% filter(term!="(Intercept)") %>% ungroup() %>% arrange(p.value)
# Supplementary Fi. 13a ---------------------------------------------------
# average rate as barplot
as.data.frame(qRateDeam) %>%
rownames_to_column() %>%
pivot_longer(cols = -rowname) %>%
mutate(rowname = paste0("d",str_remove(rowname,"%"))) %>%
pivot_wider(names_from = rowname,values_from = value) %>%
filter(name!="Others") %>%
dplyr::rename(SP_Group=name) %>%
left_join(sp_group_data) %>%
ggplot(aes(SP_Group_New,d50,fill=SP_Group_New))+
geom_col(width = 0.7)+
scale_fill_manual(values = sp_group_color_new)+
guides(fill="none")+
geom_errorbar(aes(ymin=d0,ymax=d100),width=0.1,size=0.7)+
labs(y = "CpG>TpG rate [SNVs/Gb/yr]", x= "")+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
panel_border(color = 'black',linetype = 1)
#ggsave(filename = './output/mutation_rate_age_average.pdf',width = 3,height = 4,device = cairo_pdf)
# Latency Plot by differnt accelration rate ------------------------------------------------------------
u <- base::setdiff(names(finalSnv), remove)
guessAccel <- sapply(subclonesTimeAbs, function(x) "5x")
qSubclone <- sapply(subclonesTimeAbs, function(x) apply(x[,"hat",][rownames(x)%in%u,,drop=FALSE], 2, quantile, c(0.05,0.25,0.5,0.75,0.95), na.rm=TRUE), simplify='array')
subclonesTimeAbsType <- sapply(names(subclonesTimeAbs), function(n) {x <- subclonesTimeAbs[[n]]; x[,,guessAccel[n]][setdiff(rownames(x),remove), 1:3, drop=FALSE]})
nSubclones <- sapply(subclonesTimeAbsType, function(x) sum(!is.na(x[,1])))
qSubclone <- qSubclone[,,-2]
m <- qSubclone["50%","5x",]#t[1,3,]
names(m) <- dimnames(qSubclone)[[3]]
m[nSubclones < 5] <- NA
o <- rev(order(m, na.last=NA))
n <- dimnames(qSubclone)[[3]]
#cairo_pdf(filename = 'Acceleration_alterative.pdf',width = 3,height = 5)
par(mar=c(3,3,1,1), mgp=c(2,.5,0), tcl=0.25,cex=1, bty="L", xpd=FALSE, las=1)
plot(NA,NA, xlim=c(0.5,length(m[o])+0.5), ylab="Latency [yr]", xlab="", xaxt="n", yaxs="i", ylim=c(0,18))
abline(h=seq(10,20,10), col="#DDDDDD", lty=3)
x <- seq_along(m[o])
mg14::rotatedLabel(x, labels=as.character(sp_group_lift[names(rev(sort(m)))]))
b <- .3
rect(seq_along(o)-b,qSubclone["50%","1x",o],seq_along(o)+b,qSubclone["50%","10x",o], col=alpha(as.character(sp_group_color[n[o]]),0.4), border=1)
rect(seq_along(o)-b,qSubclone["50%","2.5x",o],seq_along(o)+b,qSubclone["50%","7.5x",o], col=alpha(as.character(sp_group_color[n[o]]),0.9), border=1)
rect(seq_along(o)-b,qSubclone["50%","20x",o],seq_along(o)+b,qSubclone["50%","10x",o], col=alpha(as.character(sp_group_color[n[o]]),0.2), border=1)
segments(seq_along(o)-b,qSubclone["50%","5x",o],seq_along(o)+b,qSubclone["50%","5x",o])
#dev.off()
# Tumor latency vs proliferation maker ---------------------
tmp <- rdata1 %>%
filter(RNAseq_Type == 'Tumor', Gene %in% c('MYBL2','BUB1','PLK1','CCNE1','CCNB1','BUB1','FOXM1','TOP2A','MKI67'))
tdata <- MRCAdata %>%
filter(acceleration == "1x") %>%
mutate(Group=if_else(Latency>8,'High','Low')) %>%
left_join(tmp) %>%
left_join(sp_group_data2) %>%
filter(!is.na(Exp),!is.na(Latency))
tdata %>% group_by(Gene) %>% do(tidy(wilcox.test(Exp~Group,data=.))) %>% ungroup() %>% mutate(FDR=p.adjust(p.value))
## # A tibble: 8 × 6
## Gene statistic p.value method alternative FDR
## <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 BUB1 10476 0.0109 Wilcoxon rank sum test with cont… two.sided 0.0394
## 2 CCNB1 9769 0.000787 Wilcoxon rank sum test with cont… two.sided 0.00630
## 3 CCNE1 10440 0.00970 Wilcoxon rank sum test with cont… two.sided 0.0394
## 4 FOXM1 10193 0.00410 Wilcoxon rank sum test with cont… two.sided 0.0246
## 5 MKI67 10505 0.0120 Wilcoxon rank sum test with cont… two.sided 0.0394
## 6 MYBL2 10378 0.00787 Wilcoxon rank sum test with cont… two.sided 0.0394
## 7 PLK1 10084 0.00274 Wilcoxon rank sum test with cont… two.sided 0.0192
## 8 TOP2A 10440 0.00970 Wilcoxon rank sum test with cont… two.sided 0.0394
my_comparisons <- list(c("High",'Low'))
tdata%>%
mutate(Group=fct_rev(Group)) %>%
ggplot(aes(Group,Exp,fill=Group))+
geom_boxplot(width=0.7,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="white",stroke=0.02)+
scale_fill_jama()+
facet_wrap(~Gene,nrow = 1)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
labs(x = "Tumor latency", y = 'RNA-Seq expression log2(CPM)')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/latency_proliferation_expression.pdf',width = 8,height = 6,device = cairo_pdf)
# TP53 vs Tumor latenc/MRCA age
tdata <- MRCAdata %>%
filter(acceleration == "1x") %>%
mutate(Group=if_else(Latency>8,'High','Low')) %>%
left_join(
sherlock_data_full %>% filter(Gene=='TP53',Type=='Mutation_Driver') %>% select(Tumor_Barcode,TP53=Alteration)
) %>%
left_join(sp_group_data2)
tdata <- tdata %>% mutate(SP_Group_New = 'ALL') %>% bind_rows(tdata)
my_comparisons <- list(c("Yes",'No'))
tdata %>%
ggplot(aes(TP53,MRCA_age,fill=TP53))+
geom_boxplot(width=0.7,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="white",stroke=0.02)+
scale_fill_jama()+
scale_y_continuous(breaks = pretty_breaks(n = 7))+
facet_wrap(~SP_Group_New,nrow = 1)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
labs(x = "TP53 mutation", y = 'Tumor MRCA years')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/TP53_MRCA_years.pdf',width = 5,height = 5,device = cairo_pdf)
tmp <- rdata1 %>%
filter(RNAseq_Type == 'Tumor', Gene %in% c('MYBL2','BUB1','PLK1','CCNE1','CCNB1','BUB1','FOXM1','TOP2A','MKI67'))
sherlock_data_full %>%
filter(Gene=='TP53',Type=='Mutation_Driver') %>%
select(Tumor_Barcode,TP53=Alteration) %>%
left_join(tmp) %>%
left_join(sp_group_data2) %>%
filter(Tumor_Barcode %in% hq_samples2,!is.na(Exp)) %>%
ggplot(aes(TP53,Exp,fill=TP53))+
geom_boxplot(width=0.7,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="white",stroke=0.02)+
scale_fill_jama()+
scale_y_continuous(breaks = pretty_breaks(n = 7))+
facet_wrap(~Gene,nrow = 1)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
labs(x = "TP53 mutation", y = 'RNA-Seq expression log2(CPM)')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)
#stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/TP53_proliferation_expression.pdf',width = 9,height = 6,device = cairo_pdf)
# Latency difference between ID2 present vs ID2 absent in KRAS mutant group --------------------------
my_comparisons <- list(c("Absent",'Present'))
MRCAdata %>%
filter(acceleration == '1x') %>%
left_join(id2data) %>%
left_join(
sherlock_data_full %>%
filter(Gene=='KRAS',Type=='Mutation_Driver') %>%
select(Tumor_Barcode,KRAS=Alteration)
) %>%
left_join(sp_group_data2) %>%
filter(KRAS=='Yes',Tumor_Barcode %in% hq_samples2) %>%
filter(SP_Group_New %in% c('EU_N','EU_S')) %>%
ggplot(aes(ID2_Present,Latency,fill=ID2_Present))+
geom_boxplot(width=0.6,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=3,position = position_jitter(width = 0.15,height = 0),color="black",stroke=0.02)+
scale_fill_manual(values = id2color)+
facet_wrap(~SP_Group_New)+
scale_y_continuous(breaks = pretty_breaks(n = 7))+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4))+
labs(x = "Mutational signature ID2", y = 'Tumor latency years')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons,method = 't.test',method.args = list(alternative='greater'))
#ggsave(filename = './output/KRAS_ID2_present_vs_absent_latency.pdf',width = 5,height = 6,device = cairo_pdf)
# Latency difference between AS_N and EU_N in the EGFR mutant group --------------------------
my_comparisons <- list(c("AS_N",'EU_N'))
MRCAdata %>%
filter(acceleration == '1x') %>%
left_join(id2data) %>%
left_join(
sherlock_data_full %>%
filter(Gene=='EGFR',Type=='Mutation_Driver') %>%
select(Tumor_Barcode,EGFR=Alteration) %>%
mutate(EGFR=if_else(EGFR=='Yes','EGFR mutant','EGFR wildtype'))
) %>%
left_join(sp_group_data2) %>%
filter(Tumor_Barcode %in% hq_samples2, SP_Group_New %in% c('AS_N','EU_N'),!is.na(Latency)) %>%
ggplot(aes(SP_Group_New,Latency,fill=SP_Group_New))+
geom_boxplot(width=0.6,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=3,position = position_jitter(width = 0.15,height = 0),color="black",stroke=0.02)+
facet_wrap(~EGFR)+
scale_fill_manual(values = sp_group_color_new)+
scale_y_continuous(breaks = pretty_breaks(n = 7))+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
theme(plot.margin = margin(4,4,4,4),panel.spacing = unit(0.1,'cm'),strip.text.x = element_text(hjust = 0.5,face = 4),axis.title.x = element_blank())+
labs(y = 'Tumor latency years')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/EGFR_SP_Group_latency.pdf',width = 4,height = 6,device = cairo_pdf)
# SP_group Latency difference----------------------------------------------------------------
my_comparisons <- list( c("AS_N", "EU_N"), c("AS_N", "EU_S"), c("EU_N", "EU_S") )
my_comparisons <- list( c("AS_N", "EU_N"), c("AS_N", "EU_S"), c("EU_N", "EU_S"),c("AS_N", "Others"), c("Others", "EU_S"), c("EU_N", "Others") )
MRCAdata %>%
filter(acceleration == "1x") %>%
group_by(SP_Group) %>%
summarise(Latency =median(Latency,na.rm = T),Age =median(Age,na.rm = T),MRCA_age =median(MRCA_age,na.rm = T))
## # A tibble: 4 × 4
## SP_Group Latency Age MRCA_age
## <chr> <dbl> <dbl> <dbl>
## 1 N_A 16.8 63 43.8
## 2 N_U 14.9 67.9 48.0
## 3 Others 11.0 66.2 52.4
## 4 S_U 14.8 66.6 50.3
MRCAdata %>%
left_join(sp_group_data) %>%
filter(acceleration == "1x") %>%
filter(!is.na(Latency)) %>%
#filter(SP_Group!="Others") %>%
ggplot(aes(SP_Group_New,Latency,fill=SP_Group_New))+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="black")+
scale_fill_manual(values = sp_group_color_new)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
labs(x = "", y = 'Latency, years before diagnosis')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/latency_spgroup_1x.pdf',width = 4,height = 6,device = cairo_pdf)
MRCAdata %>%
left_join(sp_group_data) %>%
filter(acceleration == "1x") %>%
#filter((SP_Group %in% c('AS_N','EU_N') & acceleration == "1x")|(SP_Group %in% c('EU_S') & acceleration == "7.5x")) %>%
filter(!is.na(Latency)) %>%
#filter(SP_Group!="Others") %>%
ggplot(aes(SP_Group_New,MRCA_age,fill=SP_Group_New))+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0),color="black")+
scale_fill_manual(values = sp_group_color_new)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
labs(x = "", y = 'MRCA Age (years)')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons)#+
#stat_compare_means(label.y = 100)
#ggsave(filename = './output/MRCA_spgroup.pdf',width = 4,height = 6,device = cairo_pdf)
MRCAdata %>%
left_join(sp_group_data) %>%
filter(acceleration == "1x") %>%
#filter((SP_Group %in% c('AS_N','EU_N') & acceleration == "1x")|(SP_Group %in% c('EU_S') & acceleration == "7.5x")) %>%
filter(!is.na(Latency)) %>%
#filter(SP_Group!="Others") %>%
ggplot(aes(SP_Group_New,Age,fill=SP_Group_New))+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0),color="black")+
scale_fill_manual(values = sp_group_color_new)+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
labs(x = "", y = 'Age at diagnosis (years)')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/MRCA_spgroup_age.pdf',width = 4,height = 6,device = cairo_pdf)
MRCAdata %>%
left_join(sp_group_data) %>%
filter(acceleration == "1x") %>%
#filter((SP_Group %in% c('AS_N','EU_N') & acceleration == "1x")|(SP_Group %in% c('EU_S') & acceleration == "5x")) %>%
filter(!is.na(Latency)) %>%
filter(SP_Group!="Others") %>%
filter(SP_Group!='N_U') %>%
do(tidy(wilcox.test(MRCA_age~SP_Group,data=.)))
## # A tibble: 1 × 4
## statistic p.value method alternative
## <dbl> <dbl> <chr> <chr>
## 1 7777 0.00103 Wilcoxon rank sum test with continuity correcti… two.sided
# WGD latency -------------------------------------------------------------
my_comparisons <- list( c("WGD", "nWGD"))
MRCAdata %>%
left_join(sp_group_data) %>%
filter(acceleration == "1x") %>%
filter(!is.na(Latency)) %>%
filter(SP_Group!="Others") %>%
left_join(BBsolution4 %>% select(Tumor_Barcode,WGD_Status)) %>%
ggplot(aes(WGD_Status,Latency,fill=WGD_Status))+
geom_boxplot(width=0.5,outlier.shape = NA,alpha =0.25)+
geom_point(pch=21,size=1.5,position = position_jitter(width = 0.15,height = 0.05),color="black")+
facet_wrap(~SP_Group_New)+
scale_fill_jama()+
theme_ipsum_rc(axis_title_just = 'm',axis_title_size = 14)+
theme(panel.spacing = unit(0.1,"cm"))+
labs(x = "", y = 'Latency, years before diagnosis')+
guides(fill="none")+
panel_border(color = 'black',linetype = 1)+
stat_compare_means(comparisons = my_comparisons)
#ggsave(filename = './output/latency_WGD_spgroup.pdf',width = 5,height = 6,device = cairo_pdf)
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggpubr_0.6.1 ggsci_3.2.0 broom_1.0.8 ggpmisc_0.6.2
## [5] ggpp_0.5.9 ggasym_0.1.6 data.table_1.17.8 ggnewscale_0.5.2
## [9] ggrepel_0.9.6 cowplot_1.2.0 hrbrthemes_0.8.7 scales_1.4.0
## [13] ggsankey_0.0.99999 lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
## [17] dplyr_1.1.4 purrr_1.0.4 readr_2.1.5 tidyr_1.3.1
## [21] tibble_3.3.0 ggplot2_3.5.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.52 bslib_0.9.0
## [4] rstatix_0.7.2 lattice_0.22-7 tzdb_0.5.0
## [7] vctrs_0.6.5 tools_4.3.2 generics_0.1.4
## [10] pkgconfig_2.0.3 Matrix_1.6-5 RColorBrewer_1.1-3
## [13] lifecycle_1.0.4 compiler_4.3.2 farver_2.1.2
## [16] MatrixModels_0.5-4 carData_3.0-5 SparseM_1.84-2
## [19] fontquiver_0.2.1 fontLiberation_0.1.0 quantreg_6.1
## [22] htmltools_0.5.8.1 sass_0.4.10 yaml_2.3.10
## [25] Formula_1.2-5 Rttf2pt1_1.3.12 car_3.1-3
## [28] pillar_1.11.0 jquerylib_0.1.4 extrafontdb_1.0
## [31] MASS_7.3-60.0.1 cachem_1.1.0 mg14_0.0.6
## [34] abind_1.4-8 nlme_3.1-168 fontBitstreamVera_0.1.1
## [37] tidyselect_1.2.1 digest_0.6.37 stringi_1.8.7
## [40] labeling_0.4.3 splines_4.3.2 extrafont_0.19
## [43] fastmap_1.2.0 grid_4.3.2 cli_3.6.5
## [46] magrittr_2.0.3 utf8_1.2.6 dichromat_2.0-0.1
## [49] survival_3.8-3 withr_3.0.2 backports_1.5.0
## [52] gdtools_0.4.2 timechange_0.3.0 rmarkdown_2.29
## [55] ggsignif_0.6.4 hms_1.1.3 evaluate_1.0.4
## [58] knitr_1.50 viridisLite_0.4.2 mgcv_1.9-1
## [61] rlang_1.1.6 Rcpp_1.1.0 glue_1.8.0
## [64] polynom_1.4-1 rstudioapi_0.17.1 jsonlite_2.0.0
## [67] R6_2.6.1 systemfonts_1.2.3